/***
This do file gets Q1 employment decline explained by wage growth, for above/below-median rent states
***/

*-------------------------------------------------------------------------------
* Set up
*-------------------------------------------------------------------------------

* Set $root 
project figstabs, root
if (r(buildrunning)==0) include "${root}/code/config_interactive.do"

* Set globals
project, uses("${root}/code/set_globals.do")
include "${root}/code/set_globals.do"
local category "Employment"

* Create required subfolders
cap mkdir "${root}/data/derived"
cap mkdir "${root}/data/derived/Employment"
cap mkdir "${root}/data/derived/CPS"
cap mkdir "${root}/results"
cap mkdir "${root}/results/Employment"
cap mkdir "${root}/results/paper numbers"
cap mkdir "${root}/results/paper numbers/`category'"

*-------------------------------------------------------------------------------
**# 0. Get list of states by median split of median rent 
*-------------------------------------------------------------------------------
project, uses("${root}/data/derived/ACS 2014-2018 5-Year State/ACS 2014-2018 State.dta")
use "${root}/data/derived/ACS 2014-2018 5-Year State/ACS 2014-2018 State.dta", clear 

rename (state_fips pop_2014_2018 med_2br_rent_2018) (statefip pop_2018 median_2br_rent_2018)
keep median_2br_rent_2018 pop_2018 statefip
keep if !inlist(statefip, 6, 25, 36)
gquantiles rent_quantile = median_2br_rent_2018 [w = pop_2018], xtile nq(2)

assert !mi(rent_quantile)
tempfile rent_quantile 
save `rent_quantile'

*-------------------------------------------------------------------------------
**# 1. Define and clean relevant subsample of CPS 
*-------------------------------------------------------------------------------

* We are interested in the subset of the CPS that resembles our Paychex sample (impose industry restrictions)
project, uses("${root}/data/dvc/CPS/cps_00037.dta")
use "${root}/data/dvc/CPS/cps_00037.dta", clear

* Sample restrictions
keep if age >= 16
keep if !inlist(statefip, 6, 25, 36)
			
* Create NAICS
gen naics = . 
replace naics = 11 if inrange(ind, 0170, 0290)
replace naics = 21 if inrange(ind, 0370, 0490)
replace naics = 23 if inrange(ind, 0770, 0770)
replace naics = 31 if inrange(ind, 1070, 1790)
replace naics = 32 if inrange(ind, 1870, 2590)
replace naics = 33 if inrange(ind, 2670, 3990)
replace naics = 42 if inrange(ind, 4070, 4590)
replace naics = 44 if inrange(ind, 4670, 5190)
replace naics = 45 if inrange(ind, 5275, 5790)
replace naics = 48 if inrange(ind, 6070, 6290)
replace naics = 49 if inrange(ind, 6370, 6390)
replace naics = 22 if inrange(ind, 0570, 0690)
replace naics = 51 if inrange(ind, 6470, 6780)
replace naics = 52 if inrange(ind, 6870, 6992)
replace naics = 53 if inrange(ind, 7070, 7190)
replace naics = 54 if inrange(ind, 7270, 7490)
replace naics = 55 if inrange(ind, 7570, 7570)
replace naics = 56 if inrange(ind, 7580, 7790)
replace naics = 61 if inrange(ind, 7860, 7890)
replace naics = 62 if inrange(ind, 7970, 8470)
replace naics = 71 if inrange(ind, 8560, 8590)
replace naics = 72 if inrange(ind, 8660, 8690)
replace naics = 81 if inrange(ind, 8770, 9290)
replace naics = 92 if inrange(ind, 9370, 9890)
						
* Be consistent with PIE and CES series
gen naics_code = ""
replace naics_code = "11" if naics == 11
replace naics_code = "21" if naics == 21
replace naics_code = "22" if naics == 22
replace naics_code = "23" if naics == 23
replace naics_code = "3133" if naics == 31 | naics == 32 | naics == 33
replace naics_code = "42" if naics == 42
replace naics_code = "4445" if naics == 44 | naics == 45
replace naics_code = "4849" if naics == 48 | naics == 49
replace naics_code = "51" if naics == 51
replace naics_code = "52" if naics == 52
replace naics_code = "53" if naics == 53
replace naics_code = "54" if naics == 54
replace naics_code = "55" if naics == 55
replace naics_code = "56" if naics == 56
replace naics_code = "61" if naics == 61
replace naics_code = "62" if naics == 62
replace naics_code = "71" if naics == 71
replace naics_code = "72" if naics == 72
replace naics_code = "81" if naics == 81
				
* Drop some sectors according to BLS adjustment 
drop if naics == 92 															// drop those working in public sector to match CES (Total Private Employment)
drop if naics == 11 															// drop those working in agriculture, forestry, fishing, and hunting according to BLS adjustment of CPS to CES
drop if naics == 9290 															// drop workers in private households such as nannies, housekeepers, etc.
			
* Drop some classes of workers according to BLS adjustment
drop if inlist(classwkr, 0, 13, 25, 26, 27, 28, 29) 							// drop missing (0), unincorporated, self-employed (13), and all public sector employees (25-29) 
			
* Keep those with jobs 
keep if empstat == 10
			
* Convert to super sector
gen naics_ss = ""
replace naics_ss = "10" if inlist(naics_code, "11", "21")
replace naics_ss = "20" if inlist(naics_code, "23")
replace naics_ss = "30" if inlist(naics_code, "31-33")
replace naics_ss = "40" if inlist(naics_code, "42", "44-45", "48-49", "22")
replace naics_ss = "50" if inlist(naics_code, "51")
replace naics_ss = "55" if inlist(naics_code, "52", "53")
replace naics_ss = "60" if inlist(naics_code, "54", "55", "56")
replace naics_ss = "65" if inlist(naics_code, "61", "62")
replace naics_ss = "70" if inlist(naics_code, "71", "72")
replace naics_ss = "80" if inlist(naics_code, "81")
			
* Reformat NAICS codes  
replace naics_code = subinstr(naics_code, "-", "_", .) 
			
* Define hourly wages
cap drop wage
replace earnweek = . if earnweek > 9999 
replace uhrswork1 = . if uhrswork1 > 996
replace hourwage = . if hourwage > 999
replace hourwage = earnweek / uhrswork1 if mi(hourwage) & paidhour == 2  		// if paid hourly, divide weekly earnings by amount of hours usually worked
gen wage = hourwage if paidhour == 2
replace wage = earnweek / uhrswork1 if paidhour == 1

replace wage = 100 if wage > 100 & !mi(wage)
replace wage = 5 if wage < 5

* Add random noise to integer wages to avoid issues associated with wage bunching
set seed 1280
gen random_noise = runiform(-0.5, 0.5)
replace wage = wage + random_noise if wage == round(wage, 1) 

* Date of survey
gen date = ym(year, month)
format %tm date 

* Merge in rent quantiles 
merge m:1 statefip using `rent_quantile', assert(3) nogen

tempfile cps_cleaned 
save `cps_cleaned'

*-------------------------------------------------------------------------------
**# 2. Calculate wage growth over time
*-------------------------------------------------------------------------------

* Note if you want to change this local - code further down (part 2.2) requires the base date to be <= Jul 2020
local base_date = ym(2020, 7)                    
assert `base_date' <= ym(2020, 7)                               

*-------------------------------------------------------------------------------
**## 2.1. Reweigh samples for respondent composition differences
*-------------------------------------------------------------------------------

* Since we are using the repeated cross-section aspect of the CPS rather than the panel aspect, we need to account for 
* differences in respondent composition. We do so by reweighting samples to resemble the sample in the base period. 

*-------------------------------------------------------------------------------
* Clean demographic controls 
*-------------------------------------------------------------------------------

* Education
recode educ (2/71 = 1 "No high school diploma") (73=2 "High school diploma") (81 91 92=3 "Some tertiary") (111=4 "Bachelor's") (123/125=5 "Master's or PhD"), g(educ2)

* Race 
gen race2 = 1 if race == 100 
replace race2 = 2 if race == 200
replace race2 = 3 if race == 651 
replace race2 = 4 if mi(race2) & !mi(race)
 
* Age
assert !mi(age)
gen agegrp6 = .
replace agegrp6 = 1 if age < 25 
replace agegrp6 = 2 if inrange(age, 25, 34) 
replace agegrp6 = 3 if inrange(age, 35, 44) 
replace agegrp6 = 4 if inrange(age, 45, 54) 
replace agegrp6 = 5 if inrange(age, 55, 64) 
replace agegrp6 = 6 if age >= 65

* Occupation
gen occ_3_digit = round(occ2010/10, 10)

* Hispanic 
gen hispanic = hispan != 0 if !mi(hispan)

* US-born citizen 
gen us_born_citizen = inlist(citizen, 1, 2) if !mi(citizen)

*-------------------------------------------------------------------------------
* Construct inverse probability weights to make composition of sample at date XX similar to sample in base period
*-------------------------------------------------------------------------------

foreach quantile in 1 2 {
	gen newwt_`base_date'_`quantile' = earnwt                                   // weight at base period is just CPS weights
}

forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	
	foreach quantile in 1 2 {
		
		cap drop pre 
		cap drop post 
	
		gen pre = 1 if date == `base_date' 									
		replace pre = 0 if date == `date'
	
		* controls: 1990 industry (more granular than NAICS), occupation, race, gender, age group, education level, geographic region, hispanic, US citizen 
		probit pre i.ind1990 i.occ_3_digit i.race2 i.sex i.agegrp6 i.educ2 i.region i.hispanic i.us_born_citizen if wage > 0 & !mi(wage) & rent_quantile == `quantile' [pw = earnwt], difficult iterate(300)

		cap drop phat
		predict phat
			 
		* IPW, ATT: weight = 1 for post, p/(1-p) for pre *
		gen newwt_`date'_`quantile' =  earnwt * phat / (1-phat) if pre == 0      
	}
}		

forval date = `=`base_date' + 1'/`=ym(2022, 4)' {

	foreach quantile in 1 2 {
				
		local post_period = `date'
		local pre_period = `date' - 1
		
		* Wage ventiles, estimated separately at each date
		cap drop wage_pct_post wage_pct_pre
		gquantiles wage_pct_post = wage if date == `post_period' & rent_quantile == `quantile' [pw = newwt_`post_period'_`quantile'], xtile n(20)
		gquantiles wage_pct_pre = wage if date == `pre_period' & rent_quantile == `quantile' [pw = newwt_`pre_period'_`quantile'], xtile n(20)
		
		preserve 
		keep if rent_quantile == `quantile'
		gcollapse (mean) wage [pw = newwt_`post_period'_`quantile'], by(wage_pct_post)
		forvalues ventile = 1/20 {
			qui su wage if wage_pct_post == `ventile'
			local after_wage_`date'_`ventile'_`quantile' = `r(mean)'
		}
		restore 
			
		preserve 
		keep if rent_quantile == `quantile'
		gcollapse (mean) wage [pw = newwt_`pre_period'_`quantile'], by(wage_pct_pre)
		forvalues ventile = 1/20 {
			qui su wage if wage_pct_pre == `ventile'
			local before_wage_`date'_`ventile'_`quantile' = `r(mean)'
		}
		restore 
	}
}

*-------------------------------------------------------------------------------
**## 2.2. Make date-by-ventile dataset with smoothed wage growth
*-------------------------------------------------------------------------------

clear 

* Number of ventiles
set obs 20
	
* Create identifying variables (date and ventile) 
gen date = `base_date'
format date %tm

gen ventile = . 
forvalues i = 1/20 {
	replace ventile  = `i' if _n == `i'
}

* Expand to get more dates 
expand `=ym(2022, 4)' - `base_date' + 1, gen(dup)
bys ventile dup: replace date = `base_date' + _n if dup == 1
gisid date ventile

* Gen wage_growth vars for each rent quantile
foreach quantile in 1 2 {
	gen wage_before_`quantile' = . 
	gen wage_after_`quantile' =  . 
}	
	
* Get wage ventiles
forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	forvalues ventile = 1/20 {
		foreach quantile in 1 2 {
			replace wage_before_`quantile' =  `before_wage_`date'_`ventile'_`quantile'' if date == `date' & ventile == `ventile'
			replace wage_after_`quantile'  =  `after_wage_`date'_`ventile'_`quantile'' if date == `date' & ventile == `ventile'
		}
	}
}
	
*-------------------------------------------------------------------------------
* Get smoothed month-to-month wage growth by rent quantile
*-------------------------------------------------------------------------------

cap drop *sm*

foreach quantile in 1 2 {
	gen wage_growth_sm_`quantile' = 0 
}
 
forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	foreach quantile in 1 2 {
				
		qui lowess wage_after_`quantile' ventile if date == `date', gen(afterwage_sm_`date') nograph 
		qui lowess wage_before_`quantile' ventile if date == `date', gen(beforewage_sm_`date') nograph 
		replace wage_growth_sm_`quantile' = ln(afterwage_sm_`date') - ln(beforewage_sm_`date') if date == `date'
		
		drop beforewage_sm* afterwage_sm*
	}
}

foreach quantile in 1 2 {
	assert wage_growth_sm_`quantile' == 0 if date == `base_date'
}

* Save data 
save "${root}/data/derived/Employment/wage_growth_cps updated.dta", replace
project, creates("${root}/data/derived/Employment/wage_growth_cps updated.dta")
	
*-------------------------------------------------------------------------------
* Chain month-to-month wage growth to get cumulative wage growth since base period
*-------------------------------------------------------------------------------
gisid ventile date	
sort ventile date
tsset ventile date, monthly

foreach quantile in 1 2 {
	gen product_sm_`quantile' = wage_growth_sm_`quantile' + 1
}

* Sample selection problem - we observe substantial wage growth during the worst months of the pandemic, especially in low ventiles, likely because low-wage earners are becoming unemployed
* To correct for this, we do two things. 

* First, we match the sample composition in July 2020 instead of January 2020 (implemented above) - the idea being that sample selection may be persistent from the pre-pandemic period to the post-pandemic period,
* but does not exacerbate within the post-pandemic period (post-pandemic meaning post-worst part of pandemic)

* Second, implemented here, we impute wage growth between Jan 2020 and Jul 2020 using wage growth in 2019 (see Feb 2020 BLS report on real earnings): https://www.bls.gov/news.release/archives/realer_02132020.pdf
* We use nominal wage growth because we adjust for inflation in our income quartile definitions, so we want to match nominal wages to nominal thresholds.
* Relevant estimates - nominal mean hourly wage of $28.37 in Dec 2019 and $27.58 in Jan 2019. This gives us annualized growth rate of about 3%. 
* The imputed cumulative growth rate between Jan 2020 and Jul 2020 should then be 3 * 7/12. 

foreach quantile in 1 2 {
	replace product_sm_`quantile' = 1 if inrange(date, `base_date', ym(2020, 7))
	replace product_sm_`quantile' = 1 + (28.37 - 27.58) / 27.58 * 7/12 if date == ym(2020, 7)
	bysort ventile: replace product_sm_`quantile' = product_sm_`quantile' * L.product_sm_`quantile' if !inrange(date, `base_date', ym(2020, 7))
}

* Convert date to daily format
keep date ventile product_sm* wage_growth_sm*
replace date = mdy(month(dofm(date)), 15, year(dofm(date)))
format %td date
	
* Reshape long 
greshape long product_sm_ wage_growth_sm_, i(date ventile) j(rent_quantile)
rename (product_sm_ wage_growth_sm_)(product_sm wage_growth_sm)
	
* Save 
save "${root}/data/derived/CPS/cps calculated wage cutoffs updated.dta", replace
project, creates("${root}/data/derived/CPS/cps calculated wage cutoffs updated.dta")

*-------------------------------------------------------------------------------
**# 3. Get employment in each quartile explained by wage growth
*-------------------------------------------------------------------------------

* "Explained by wage growth" means - holding total employment and respondent composition fixed, how do workers move between quartiles?
* In particular - how many workers who were Q1 in the base period move to Q2 and above by the end date, assuming they all stay employed (and there are no new entrants into employment)

use `cps_cleaned', clear				
keep if date == `base_date' 
keep if wage > 0 & !mi(wage)
	
* Get base period wage ventiles
gquantiles ventile = wage [pw = earnwt], by(rent_quantile) xtile n(20)
	
* Stack copies of the base period data - basically, we want to keep the sample in the base period fixed
preserve 

forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	replace date = `date'

	tempfile date_`date'
	save `date_`date''
}
restore
	
forval date = `=`base_date' + 1'/`=ym(2022, 4)' {
	append using `date_`date''
}
	
* Convert date to daily format
replace date = mdy(month(dofm(date)), 15, year(dofm(date)))
format %td date
	
* Merge in wage growth rates by ventile and rent quantile
project, uses("${root}/data/derived/CPS/cps calculated wage cutoffs updated.dta") derived
merge m:1 ventile date rent_quantile using "${root}/data/derived/CPS/cps calculated wage cutoffs updated.dta", assert(3) nogen
	
* Change wages using growth in ventiles 
replace wage = wage * product_sm

* Define income quartiles using federal poverty thresholds
project, uses("${root}/data/dvc/Employment/poverty_thresholds.dta")
merge m:1 date using "${root}/data/dvc/Employment/poverty_thresholds.dta", keep(3)
	
gen quartile = . 
replace quartile = 1 if wage <= poverty_100 & !mi(wage)
replace quartile = 2 if wage > poverty_100 & wage <= poverty_150 & !mi(wage)
replace quartile = 3 if wage > poverty_150 & wage <= poverty_250 & !mi(wage)
replace quartile = 4 if wage > poverty_250 & !mi(wage)	
assert !mi(quartile)
	
* Total employment at each date, rent quantile, and income quartile
gen count = 1 
gcollapse (sum) employment_cps = count  [pw = earnwt], by(date rent_quantile quartile)	

* Employment change relative to base period, in percentage points 
gen temp = employment_cps if date == mdy(month(dofm(`base_date')), 15, year(dofm(`base_date')))
gegen base = mean(temp), by(rent_quantile quartile)
assert base == temp if !mi(temp)
gen emp_wage_growth = 100 * (employment_cps / base - 1)
	
* Save 
gen month = month(date)
gen year = year(date)
keep date quartile rent_quantile emp_wage_growth year month employment_cps

save "${root}/data/derived/Employment/employment just from wage growth updated.dta", replace 
project, creates("${root}/data/derived/Employment/employment just from wage growth updated.dta")

*==============================================================================*
**# 4. Get ratio of Q1 employment explained by wage growth
*==============================================================================*

* Get Tracker data 
project, uses("${root}/data/web/data/Employment - State - Weekly.csv")
import delimited "${root}/data/web/data/Employment - State - Weekly.csv", clear
keep if !inlist(statefips, 6, 25, 36)
gen date = mdy(month, day_endofweek, year)
format %td date
	
* Keep only necessary variables and express employment changes in percentage points
keep date emp_incq* statefips 

forvalues q = 1/4{
	replace emp_incq`q' = (emp_incq`q') * 100
}
	
* Merge to population and collapse 
rename statefips statefip 
merge m:1 statefip using `rent_quantile', assert(3) nogen
gcollapse emp_incq* [w = pop_2018], by(date rent_quantile)
	
* Reshape 
greshape long emp_incq, i(date rent_quantile) j(quartile)
	
* Get month and year 
gen month = month(date)
gen year = year(date)

* Merge employment just from wage growth
project, uses("${root}/data/derived/Employment/employment just from wage growth updated.dta") derived
merge m:1 rent_quantile quartile month year using "${root}/data/derived/Employment/employment just from wage growth updated.dta", keepusing(emp_wage_growth) keep(1 3) nogen 

sort quartile date

foreach quantile in 1 2 {
	
	su emp_wage_growth if quartile == 1 & date == ${finaldate} & rent_quantile == `quantile'
	local adjust_mean = `r(mean)'
	su emp_incq if quartile == 1 & date == ${finaldate} & rent_quantile == `quantile'
	local pp = `r(mean)'

	* Proportion of Q1 decline explained by wage growth
	local prop_expl = string(100 * `adjust_mean' / `pp', "%3.0f")
	local prop_not_expl = 100 - `prop_expl'
	di in red "% Decline Explained by Wage Growth: `prop_expl'%"

	* Proportion of Q1 decline unexplained by wage growth
	local pp_expl_`quantile' : di %3.1f -1 * (`adjust_mean')
	local pp_not_expl_`quantile' : di %3.1f -1 * (`pp' - `adjust_mean')
	di `pp_expl_`quantile''
	di `pp_not_expl_`quantile''
}

*-------------------------------------------------------------------------------
* Export output numbers to csv file
*-------------------------------------------------------------------------------

cap erase "${root}/results/paper numbers/`category'/Q1 Wage Growth by Rent.yaml"

yamlout using "${root}/results/paper numbers/`category'/Q1 Wage Growth by Rent.yaml", ///
	key("expl_pp_lowrent") ///
	comment("Decline explained by wage growth (p.p.)") ///
	value(`pp_expl_1') fmt(%9.2f)
	
yamlout using "${root}/results/paper numbers/`category'/Q1 Wage Growth by Rent.yaml", ///
	key("expl_pp_highrent") ///
	comment("Decline explained by wage growth (p.p.)") ///
	value(`pp_expl_2') fmt(%9.2f)

project, creates("${root}/results/paper numbers/`category'/Q1 Wage Growth by Rent.yaml")
